Overall Scores
import pandas as pd
import matplotlib.pyplot as plt
# Load the paired pre/post dataset (semicolon-separated, comma as decimal mark).
merged_scores = pd.read_csv("../FINAL PRE-POST DATASET.csv", sep=";", decimal=",")

# Reshape to long format: one row per (CompletionCode, test phase).
overall_scores = pd.melt(
    merged_scores,
    id_vars=["CompletionCode"],
    value_vars=["PreOverallScore", "PostOverallScore"],
    var_name="Test Phase",
    value_name="Score",
)
phase_labels = {"PreOverallScore": "Pre-Test", "PostOverallScore": "Post-Test"}
overall_scores["Test Phase"] = overall_scores["Test Phase"].replace(phase_labels)

# Multiply by 100: the values are printed and plotted with a "%" unit below.
overall_scores["Score"] = overall_scores["Score"].astype(float) * 100

# Split the long table once and reuse the two series for means and plotting.
pre_overall = overall_scores.loc[overall_scores["Test Phase"] == "Pre-Test", "Score"]
post_overall = overall_scores.loc[overall_scores["Test Phase"] == "Post-Test", "Score"]

pretest_mean = pre_overall.mean()
posttest_mean = post_overall.mean()
print(f"Pre-Test Mean: {pretest_mean:.2f}%")
print(f"Post-Test Mean: {posttest_mean:.2f}%")

# Horizontal box plot; Post-Test is passed first, Pre-Test second.
plt.figure(figsize=(8, 6))
plt.boxplot(
    [post_overall, pre_overall],
    vert=False,
    labels=["Post-Test", "Pre-Test"],
)
plt.title("Overall Correctness Scores: Pre-Test vs. Post-Test")
plt.xlabel("Correctness Score (%)")
plt.xlim(0, 100)
plt.grid(axis="x", linestyle="--", alpha=0.7)
plt.show()
Pre-Test Mean: 64.56% Post-Test Mean: 67.60%
Stats per Chart Type, Material, and School
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load the dataset (semicolon-separated, comma as decimal mark).
df = pd.read_csv("../FINAL PRE-POST DATASET.csv", sep=";", decimal=",")

# Every column whose name contains "Score" is multiplied by 100; the plots
# and printouts below label these values with a "%" unit.
score_columns = [name for name in df.columns if "Score" in name]
df[score_columns] = df[score_columns].mul(100)

sns.set_style("whitegrid")

# Fixed colour per chart type, used by create_boxplot_fixed_colors.
color_palette = dict(
    LineChart="blue",
    AreaChart="green",
    StackedAreaChart="orange",
    Streamgraph="red",
)
def create_boxplot_fixed_colors(data, x, y, hue=None, order=None, title="", xlabel="", ylabel=""):
    """Draw and show a seaborn box plot with this notebook's shared styling.

    Args:
        data: DataFrame holding the columns named by ``x``, ``y`` and,
            if given, ``hue``. The caller's frame is left unmodified.
        x: Name of the column plotted on the x axis.
        y: Name of the column plotted on the y axis.
        hue: Optional column used for colour grouping. The special value
            ``"Ordered Label"`` colours the boxes by the chart type found
            inside that column's text, using the module-level
            ``color_palette``.
        order: Optional explicit ordering, forwarded to ``sns.boxplot``.
        title: Figure title.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.

    The x axis is always limited to 0-100 and the figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=(10, 3.5))

    if hue == "Ordered Label":
        # Derive the chart type from the combined label. ``assign`` returns a
        # new frame, so the caller's DataFrame does not silently gain a
        # "HueGroup" column as a side effect of plotting.
        chart_type = data[hue].str.extract(
            r"(LineChart|AreaChart|StackedAreaChart|Streamgraph)", expand=False
        )
        data = data.assign(HueGroup=chart_type)
        ax = sns.boxplot(data=data, x=x, y=y, order=order, hue="HueGroup", palette=color_palette)
    else:
        ax = sns.boxplot(data=data, x=x, y=y, order=order, hue=hue)

    # Give every artist registered on the axes a black, slightly thicker edge.
    # NOTE(review): depending on the matplotlib/seaborn version the boxes may
    # be exposed via ``ax.patches`` rather than ``ax.artists``, in which case
    # this loop does nothing - confirm against the installed versions.
    for box in ax.artists:
        box.set_edgecolor("black")
        box.set_linewidth(1.5)

    # Thicken every solid line drawn on the axes.
    for line in ax.lines:
        if line.get_linestyle() == '-':
            line.set_linewidth(2.5)

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xlim(0, 100)
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    if hue:
        # The legend is titled with the requested hue name, also in the
        # "Ordered Label" case where the colours come from HueGroup.
        plt.legend(title=hue, loc="upper left", fontsize="small")
    plt.show()
# Chart types covered by the per-chart score columns (Pre<Type>Score / Post<Type>Score).
viz_types = ["LineChart", "AreaChart", "StackedAreaChart", "Streamgraph"]

# Column means before and after, keyed by chart type.
mean_scores = {}
for viz in viz_types:
    mean_scores[viz] = {
        "Pre-Test Mean": df[f"Pre{viz}Score"].mean(),
        "Post-Test Mean": df[f"Post{viz}Score"].mean(),
    }

print("Mean Scores for Each Visualization Type:")
for viz, scores in mean_scores.items():
    print(f"{viz} Pre-Test: {scores['Pre-Test Mean']:.2f}% | {viz} Post-Test: {scores['Post-Test Mean']:.2f}%")
# Box Plot 1: Overall Pre-Test vs. Post-Test Scores
# Long format: one row per (CompletionCode, test phase).
overall_scores = pd.melt(
    df,
    id_vars=["CompletionCode"],
    value_vars=["PreOverallScore", "PostOverallScore"],
    var_name="Test Phase",
    value_name="Score",
)
overall_phase_labels = {"PreOverallScore": "Pre-Test", "PostOverallScore": "Post-Test"}
overall_scores["Test Phase"] = overall_scores["Test Phase"].replace(overall_phase_labels)

create_boxplot_fixed_colors(
    overall_scores,
    "Score",
    "Test Phase",
    title="Overall Correctness Scores: Pre-Test vs. Post-Test",
    xlabel="Correctness Score (%)",
    ylabel="Test Phase",
)
# Box Plot 2: Pre-Test vs. Post-Test per Visualization Type
pre_viz_columns = [f"Pre{viz}Score" for viz in viz_types]
post_viz_columns = [f"Post{viz}Score" for viz in viz_types]
viz_scores = pd.melt(
    df,
    id_vars=["CompletionCode"],
    value_vars=pre_viz_columns + post_viz_columns,
    var_name="Test Phase",
    value_name="Score",
)

# The melted "Test Phase" column still holds the original column names.
# Pull the chart type out first, then collapse the name to "Pre-Test" / "Post-Test".
viz_scores["Visualization Type"] = viz_scores["Test Phase"].str.extract(r"(LineChart|AreaChart|StackedAreaChart|Streamgraph)")
viz_scores["Test Phase"] = viz_scores["Test Phase"].str.replace(r"(Pre|Post)(.*)", r"\1-Test", regex=True)

# Combined "<chart type> <phase>" label used as the categorical axis.
viz_scores["Ordered Label"] = viz_scores["Visualization Type"] + " " + viz_scores["Test Phase"]

# Display order: pre and post side by side for each chart type.
viz_order = [
    "LineChart Pre-Test",
    "LineChart Post-Test",
    "AreaChart Pre-Test",
    "AreaChart Post-Test",
    "StackedAreaChart Pre-Test",
    "StackedAreaChart Post-Test",
    "Streamgraph Pre-Test",
    "Streamgraph Post-Test",
]

create_boxplot_fixed_colors(
    viz_scores,
    "Score",
    "Ordered Label",
    hue="Ordered Label",
    order=viz_order,
    title="Pre-Test vs. Post-Test Scores per Visualization Type",
    xlabel="Correctness Score (%)",
    ylabel="Visualization Type & Test Phase",
)
# Box Plot 3: Pre-Test vs. Post-Test per Learning Material
# "Gruppe" is kept as an id column so it can be used as the categorical axis.
group_scores = pd.melt(
    df,
    id_vars=["CompletionCode", "Gruppe"],
    value_vars=["PreOverallScore", "PostOverallScore"],
    var_name="Test Phase",
    value_name="Score",
)
material_phase_labels = {"PreOverallScore": "Pre-Test", "PostOverallScore": "Post-Test"}
group_scores["Test Phase"] = group_scores["Test Phase"].replace(material_phase_labels)

create_boxplot_fixed_colors(
    group_scores,
    "Score",
    "Gruppe",
    hue="Test Phase",
    title="Pre-Test vs. Post-Test Scores per Learning Material",
    xlabel="Correctness Score (%)",
    ylabel="Learning Material",
)
# Box Plot 4: Pre-Test vs. Post-Test per School (Test ID)
# "Test ID" is kept as an id column so it can be used as the categorical axis.
school_scores = pd.melt(
    df,
    id_vars=["CompletionCode", "Test ID"],
    value_vars=["PreOverallScore", "PostOverallScore"],
    var_name="Test Phase",
    value_name="Score",
)
school_phase_labels = {"PreOverallScore": "Pre-Test", "PostOverallScore": "Post-Test"}
school_scores["Test Phase"] = school_scores["Test Phase"].replace(school_phase_labels)

create_boxplot_fixed_colors(
    school_scores,
    "Score",
    "Test ID",
    hue="Test Phase",
    title="Pre-Test vs. Post-Test Scores per School",
    xlabel="Correctness Score (%)",
    ylabel="School",
)
Mean Scores for Each Visualization Type: LineChart Pre-Test: 85.33% | LineChart Post-Test: 81.33% AreaChart Pre-Test: 60.22% | AreaChart Post-Test: 60.44% StackedAreaChart Pre-Test: 54.67% | StackedAreaChart Post-Test: 69.78% Streamgraph Pre-Test: 49.44% | Streamgraph Post-Test: 68.33%
Wilcoxon Signed-Rank Test for Pre- and Post-Test Scores per Chart Type
import pandas as pd
from scipy.stats import wilcoxon
# Paired pre/post comparison per chart type using the Wilcoxon signed-rank test.
df = pd.read_csv("../FINAL PRE-POST DATASET.csv", sep=";", decimal=",")
viz_types = ["LineChart", "AreaChart", "StackedAreaChart", "Streamgraph"]

# Wording for significant results, keyed by (p < 0.01, post mean > pre mean).
significant_labels = {
    (True, True): "Highly significant improvement",
    (True, False): "Highly significant decline",
    (False, True): "Significant improvement",
    (False, False): "Significant decline",
}

wilcoxon_results = {}
for viz in viz_types:
    pre_scores = df[f"Pre{viz}Score"]
    post_scores = df[f"Post{viz}Score"]

    # Only pairs whose score actually changed are handed to the test.
    differences = post_scores - pre_scores
    nonzero_indices = differences.ne(0)
    stat, p = wilcoxon(
        pre_scores[nonzero_indices],
        post_scores[nonzero_indices],
        alternative='two-sided',
    )

    if p < 0.05:
        # Direction is decided by comparing the column means (all rows).
        improved = post_scores.mean() > pre_scores.mean()
        interpretation = significant_labels[(p < 0.01, improved)]
    else:
        interpretation = "No significant change"

    wilcoxon_results[viz] = {
        "Test Statistic": stat,
        "p-value": p,
        "Interpretation": interpretation,
    }

# One row per chart type.
wilcoxon_df = pd.DataFrame.from_dict(wilcoxon_results, orient="index")
print("\nWilcoxon Signed-Rank Test Results:")
print(wilcoxon_df)
Wilcoxon Signed-Rank Test Results:
Test Statistic p-value Interpretation
LineChart 146.5 0.483352 No significant change
AreaChart 295.5 0.986539 No significant change
StackedAreaChart 100.0 0.000060 Highly significant improvement
Streamgraph 78.0 0.001816 Highly significant improvement
Kruskal-Wallis for Material vs Score
import pandas as pd
from scipy.stats import kruskal
# Kruskal-Wallis tests across the "Gruppe" groups, for the pre-to-post
# improvement and for the post-test score itself.
df = pd.read_csv("../FINAL PRE-POST DATASET.csv", sep=";", decimal=",")
df["Improvement"] = df["PostOverallScore"].sub(df["PreOverallScore"])

# One array of values per group, in groupby order.
improvement_groups = [values.values for _, values in df.groupby("Gruppe")["Improvement"]]
posttest_groups = [values.values for _, values in df.groupby("Gruppe")["PostOverallScore"]]

kw_improvement = kruskal(*improvement_groups)
kw_posttest = kruskal(*posttest_groups)

# Effect size computed from H as (H - (k - 1)) / (N - k),
# with k = number of groups and N = number of rows.
k = df["Gruppe"].nunique()
N = len(df)
between_dof = k - 1
within_dof = N - k
eta_squared_improvement = (kw_improvement.statistic - between_dof) / within_dof
eta_squared_posttest = (kw_posttest.statistic - between_dof) / within_dof

print("Kruskal-Wallis Test on Improvement Scores:")
print(f"H-statistic = {kw_improvement.statistic:.5f}, p-value = {kw_improvement.pvalue:.5f}")
print(f"Effect Size (η²) = {eta_squared_improvement:.5f}\n")

print("Kruskal-Wallis Test on Post-Test Scores:")
print(f"H-statistic = {kw_posttest.statistic:.5f}, p-value = {kw_posttest.pvalue:.5f}")
print(f"Effect Size (η²) = {eta_squared_posttest:.5f}")
Kruskal-Wallis Test on Improvement Scores: H-statistic = 5.71405, p-value = 0.12638 Effect Size (η²) = 0.06620 Kruskal-Wallis Test on Post-Test Scores: H-statistic = 2.52040, p-value = 0.47162 Effect Size (η²) = -0.01170
Dunn's Post-Hoc Test
The Kruskal-Wallis test is not significant, but the improvement scores show a small-to-moderate effect size (η² ≈ 0.066), so we check which material might have contributed. Dunn's test is used for the pairwise comparisons, with the Holm-Bonferroni correction to control for multiple comparisons.
import pandas as pd
import scikit_posthocs as sp
import numpy as np
from scipy.stats import kruskal
# Omnibus Kruskal-Wallis test on the improvement scores, followed by Dunn's
# pairwise comparisons between the "Gruppe" groups (Holm-adjusted p-values).
df = pd.read_csv("../FINAL PRE-POST DATASET.csv", sep=";", decimal=",")
df["Improvement"] = df["PostOverallScore"].sub(df["PreOverallScore"])

improvement_by_material = [values.values for _, values in df.groupby("Gruppe")["Improvement"]]
kw_improvement = kruskal(*improvement_by_material)
print(f"Kruskal-Wallis H-statistic = {kw_improvement.statistic:.5f}, p-value = {kw_improvement.pvalue:.5f}")

dunn_results = sp.posthoc_dunn(
    df,
    val_col="Improvement",
    group_col="Gruppe",
    p_adjust="holm",
)
print("\nDunn's Test (Pairwise Comparisons of Learning Materials):")
print(dunn_results)
Kruskal-Wallis H-statistic = 5.71405, p-value = 0.12638
Dunn's Test (Pairwise Comparisons of Learning Materials):
Comic Game Schulbuch Video
Comic 1.000000 1.000000 1.000000 0.136068
Game 1.000000 1.000000 1.000000 0.323652
Schulbuch 1.000000 1.000000 1.000000 0.489282
Video 0.136068 0.323652 0.489282 1.000000
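Interpretation: None of the pairwise comparisons is statistically significant after the Holm correction. Comic performed the lowest and Video the highest, which is why this pair has the smallest adjusted p-value (0.136), but this is not evidence of a real effect.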
Individual Improvements of Students
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, kruskal
# Per-student change between pre- and post-test overall score.
df_new = pd.read_csv("../FINAL PRE-POST DATASET.csv", sep=";", decimal=",")
score_columns = ["PreOverallScore", "PostOverallScore"]
df_new[score_columns] = df_new[score_columns].mul(100)
df_new["Score Change"] = df_new["PostOverallScore"].sub(df_new["PreOverallScore"])

# How many students went up, stayed level, or went down.
count_improved = df_new["Score Change"].gt(0).sum()
count_same = df_new["Score Change"].eq(0).sum()
count_declined = df_new["Score Change"].lt(0).sum()

print("Student Performance Summary:")
print(f"Improved: {count_improved}")
print(f"Stayed the Same: {count_same}")
print(f"Declined: {count_declined}\n")

# Split students into three quantile-based groups by their pre-test score.
df_new["Performance Group"] = pd.qcut(
    df_new["PreOverallScore"],
    q=3,
    labels=["Low", "Medium", "High"],
)
change_by_group = df_new.groupby("Performance Group", observed=True)["Score Change"]
group_means_new = change_by_group.agg(["mean", "std", "count"])
print("Mean Improvement by Performance Group:")
print(group_means_new, "\n")
# Scatter: pre-test score against score change, with a zero reference line.
plt.figure(figsize=(8, 5))
scatter_ax = sns.scatterplot(data=df_new, x="PreOverallScore", y="Score Change", alpha=0.6)
scatter_ax.axhline(0, color="red", linestyle="--")
scatter_ax.set_title("Scatterplot of Pre-Test Scores vs. Score Improvement")
scatter_ax.set_xlabel("Pre-Test Score (%)")
scatter_ax.set_ylabel("Score Improvement (%)")
plt.show()

# Box plot: score change per pre-test performance group, Low to High.
plt.figure(figsize=(8, 5))
group_ax = sns.boxplot(
    data=df_new,
    x="Score Change",
    y="Performance Group",
    order=["Low", "Medium", "High"],
)
group_ax.axvline(0, color="red", linestyle="--")
group_ax.set_title("Score Improvement by Pre-Test Performance Group")
group_ax.set_xlabel("Score Improvement (%)")
group_ax.set_ylabel("Pre-Test Performance Group")
plt.show()
# Kruskal-Wallis across the performance groups: one series of score changes
# per distinct value of "Performance Group".
groups_new = [
    df_new.loc[df_new["Performance Group"] == g, "Score Change"]
    for g in df_new["Performance Group"].unique()
]
kruskal_stat_new, kruskal_p_new = kruskal(*groups_new)
kruskal_verdict = '(Significant)' if kruskal_p_new < 0.05 else '(Not Significant)'
print("Kruskal-Wallis Test for Performance Group Differences:")
print(f"Statistic: {kruskal_stat_new:.4f}, p-value: {kruskal_p_new:.4f} {kruskal_verdict}\n")

# Rank correlation between the pre-test score and the score change.
spearman_corr_new, spearman_p_new = spearmanr(df_new["PreOverallScore"], df_new["Score Change"])
spearman_verdict = '(Significant)' if spearman_p_new < 0.05 else '(Not Significant)'
print("Spearman Correlation Between Pre-Test Score and Score Improvement:")
print(f"Spearman Correlation: {spearman_corr_new:.4f}, p-value: {spearman_p_new:.4f} {spearman_verdict}\n")
Student Performance Summary:
Improved: 25
Stayed the Same: 6
Declined: 14
Mean Improvement by Performance Group:
mean std count
Performance Group
Low 4.953560 19.646335 17
Medium 8.771930 12.685765 15
High -6.072874 17.168729 13
Kruskal-Wallis Test for Performance Group Differences: Statistic: 5.3180, p-value: 0.0700 (Not Significant) Spearman Correlation Between Pre-Test Score and Score Improvement: Spearman Correlation: -0.2486, p-value: 0.0996 (Not Significant)
Interpretation: The Kruskal-Wallis test shows that the differences in improvement between the performance groups are not statistically significant (p = 0.07). The p-value is close to 0.05, though, so there might be a weak trend. The Spearman correlation (ρ = -0.249, p = 0.0996) suggests that students with lower pre-test scores might have improved more, but this trend is not statistically significant either.
Gender and Age vs. Individual Improvement
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu, spearmanr
# Score change per student, then compared by gender and correlated with age.
df_new = pd.read_csv("../FINAL PRE-POST DATASET.csv", sep=";", decimal=",")
score_columns = ["PreOverallScore", "PostOverallScore"]
df_new[score_columns] = df_new[score_columns].mul(100)
df_new["Score Change"] = df_new["PostOverallScore"].sub(df_new["PreOverallScore"])

# Normalise the demographic columns: gender as text, age as a number
# (values that cannot be parsed become NaN).
df_new["Geschlecht"] = df_new["Geschlecht"].astype(str)
df_new["Alter"] = pd.to_numeric(df_new["Alter"], errors="coerce")

# Only rows labelled "Männlich" or "Weiblich" enter the gender comparison.
is_male_or_female = df_new["Geschlecht"].isin(["Männlich", "Weiblich"])
df_gender_filtered = df_new[is_male_or_female]

male_scores = df_gender_filtered.loc[
    df_gender_filtered["Geschlecht"] == "Männlich", "Score Change"
].dropna()
female_scores = df_gender_filtered.loc[
    df_gender_filtered["Geschlecht"] == "Weiblich", "Score Change"
].dropna()

male_mean, male_std = male_scores.mean(), male_scores.std()
female_mean, female_std = female_scores.mean(), female_scores.std()

print("Mean and Standard Deviation of Score Improvement by Gender:")
print(f"Männlich: Mean = {male_mean:.2f}, SD = {male_std:.2f}")
print(f"Weiblich: Mean = {female_mean:.2f}, SD = {female_std:.2f}\n")

# Two-sided Mann-Whitney U test between the two gender samples.
mannwhitney_stat, mannwhitney_p = mannwhitneyu(
    male_scores,
    female_scores,
    alternative="two-sided",
)
# Spearman correlation of age with score change; NaN pairs are omitted.
spearman_age_corr, spearman_age_p = spearmanr(
    df_new["Alter"],
    df_new["Score Change"],
    nan_policy="omit",
)
# Box plot: score change per gender, with a zero reference line.
plt.figure(figsize=(8, 2))
gender_ax = sns.boxplot(data=df_gender_filtered, x="Score Change", y="Geschlecht")
gender_ax.axvline(0, color="red", linestyle="--")
gender_ax.set_title("Score Improvement by Gender")
gender_ax.set_xlabel("Score Improvement (%)")
gender_ax.set_ylabel("Gender")
plt.show()

# Scatter: age against score change, with a zero reference line.
plt.figure(figsize=(8, 5))
age_ax = sns.scatterplot(data=df_new, x="Alter", y="Score Change", alpha=0.6)
age_ax.axhline(0, color="red", linestyle="--")
age_ax.set_title("Scatterplot of Age vs. Score Improvement")
age_ax.set_xlabel("Age")
age_ax.set_ylabel("Score Improvement (%)")
plt.show()
# Report both tests; results with p < 0.05 are flagged as significant.
mannwhitney_verdict = '(Significant)' if mannwhitney_p < 0.05 else '(Not Significant)'
print("Mann-Whitney U Test (Gender Differences in Score Improvement):")
print(f"Statistic: {mannwhitney_stat:.4f}, p-value: {mannwhitney_p:.4f} {mannwhitney_verdict}\n")

spearman_age_verdict = '(Significant)' if spearman_age_p < 0.05 else '(Not Significant)'
print("Spearman Correlation (Age vs. Score Improvement):")
print(f"Correlation: {spearman_age_corr:.4f}, p-value: {spearman_age_p:.4f} {spearman_age_verdict}\n")
Mean and Standard Deviation of Score Improvement by Gender: Männlich: Mean = -3.16, SD = 17.59 Weiblich: Mean = 11.15, SD = 14.40
Mann-Whitney U Test (Gender Differences in Score Improvement): Statistic: 102.0000, p-value: 0.0047 (Significant) Spearman Correlation (Age vs. Score Improvement): Correlation: -0.0840, p-value: 0.5834 (Not Significant)
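Interpretation: Age is not significantly correlated with score improvement (ρ = -0.084, p = 0.583), but gender makes a highly significant difference (Mann-Whitney U = 102, p = 0.0047): on average, female students improved by 11.15 percentage points, while male students declined by 3.16.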
Demographics
import matplotlib.pyplot as plt
import seaborn as sns
# Gender counts (from the gender-filtered frame) and age mean / standard
# deviation (from the full frame).
gender_counts = df_gender_filtered["Geschlecht"].value_counts()
age_values = df_new["Alter"]
mean_age = age_values.mean()
std_age = age_values.std()

print("Gender Distribution:")
for gender in gender_counts.index:
    count = gender_counts[gender]
    print(f"{gender}: {count} students")

print(f"\nMean Age: {mean_age:.2f} years")
print(f"Standard Deviation of Age: {std_age:.2f} years\n")
# Age histogram with one bin and one tick per whole year between the
# youngest and the oldest recorded age.
youngest = int(df_new["Alter"].min())
oldest = int(df_new["Alter"].max())

plt.figure(figsize=(8, 2))
sns.histplot(
    df_new["Alter"].dropna(),
    bins=range(youngest, oldest + 2),
    kde=False,
    color="blue",
    discrete=True,
)
plt.xticks(range(youngest, oldest + 1))
plt.title("Age Distribution of Students")
plt.xlabel("Age")
plt.ylabel("Count")
plt.show()
# Turn the gender counts into a two-column table and draw it as a bar chart.
gender_counts_df = gender_counts.reset_index().set_axis(["Geschlecht", "Count"], axis=1)

plt.figure(figsize=(4, 2))
gender_bar_ax = sns.barplot(
    data=gender_counts_df,
    x="Geschlecht",
    y="Count",
    hue="Geschlecht",
    palette=["blue", "pink"],
    legend=False,
)
gender_bar_ax.set_title("Gender Distribution")
gender_bar_ax.set_xlabel("Gender")
gender_bar_ax.set_ylabel("Count")
plt.show()
Gender Distribution: Männlich: 25 students Weiblich: 17 students Mean Age: 14.22 years Standard Deviation of Age: 1.66 years